#' ---
#' title: Link AcNet and DIME datasets
#' author: Joe Ornstein
#' date: 2025-06-20
#' version: 0.2
#' ---

library(tidyverse)
library(fuzzylink)
source('code/acnet.R') # functions to query Amicus Curiae Networks API

## Load and clean datasets ----------------------

# Get organizations that cosigned amicus briefs in 2012.
# acnet.web.query() is defined in code/acnet.R; its result is read below as a
# path to a JSON file (assumed from usage -- confirm against code/acnet.R).
filepath <- acnet.web.query(minRange = 2012, maxRange = 2012)
acnet <- jsonlite::fromJSON(readLines(filepath, warn = FALSE))$data

# Cosigner names are split on '||' and cosigner IDs on ','; both results are
# flattened into one long vector each.
orgname <- str_split(acnet$organizationNames, '\\|\\|') |> unlist()
orgID <- str_split(acnet$orgIDs, ',') |> unlist()

# The two vectors are paired by position below. data.frame() silently recycles
# the shorter vector when the lengths divide evenly, which would mispair IDs
# and names, so fail loudly on any length mismatch instead.
stopifnot(
  'orgIDs and organizationNames split into different numbers of entries' =
    length(orgID) == length(orgname)
)

# put all unique cosigners in a dataframe, with names in all-caps
amicus <- data.frame(orgID, orgname) |>
  dplyr::distinct() |>
  mutate(orgname = str_to_upper(orgname))

# Load the Abi-Hassan et al. (2023) merge; the join below expects this file to
# provide `acnet_final`.
load('raw/acnet_scores_5_13_22.rdata')

# Attach the scores to the cosigners (names are upper-cased in acnet_final so
# they are comparable with amicus), then drop cosigners without an acnet_score.
amicus <- left_join(
  amicus,
  mutate(acnet_final, orgname = str_to_upper(orgname)),
  by = c('orgID', 'orgname')
) |>
  filter(!is.na(acnet_score))

# Load DIME contributors (organizations only); the code below expects this
# file to provide `organizations`.
load('raw/dime_contributors_organizations_1979_2022.rdata')

# Add an all-caps `orgname` column built from the most recent contributor name
organizations <- mutate(
  organizations,
  orgname = str_to_upper(most.recent.contributor.name)
)

## Exact matches --------------------

# Keep only the amicus/DIME pairs whose `orgname` values are identical.
# NOTE(review): `orgname` may not be unique on either side, in which case this
# join returns one row per matching pair -- confirm that is intended.
df_exact <- amicus |>
  inner_join(organizations, by = 'orgname')

## fuzzylink: merge with organizations who are frequent contributors (num.distinct >= 8) ----------

# Restrict the DIME side to frequent contributors.
bonica <- organizations |>
  filter(num.distinct >= 8)

# Drop the full DIME table; only the filtered subset is used in the rest of
# this section.
rm(organizations)

model <- 'gpt-4o-2024-11-20'
# NOTE(review): fmla is defined here but is not passed to the fuzzylink() call
# below -- confirm whether it is used elsewhere or should be supplied.
fmla <- match ~ sim + jw

# Make sure the output directory exists BEFORE the expensive step, so that the
# final save() cannot fail (and discard hours of paid API work) just because
# the folder is missing. dir.create() is a no-op if the folder already exists.
out_file <- 'data/organizations-merge/bonica_fuzzylink.RData'
dir.create(dirname(out_file), recursive = TRUE, showWarnings = FALSE)

# approximately 6.5 hours and $10.05 API fees
df <- fuzzylink(amicus, bonica,
                by = 'orgname',
                record_type = 'organization or PAC',
                instructions = 'Misspellings, alternative names, and acronyms may be acceptable matches.',
                model = model)

# Store the fuzzy matches together with the exact matches.
save(df, df_exact, file = out_file)
